path <- "/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/popgen_analyses/clustering_PCA/admixture/MaciRef_vcfAllfRef"
dt <- read.delim("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/messor_contig_AllGeneticData.txt") %>%
rename(Ind = sra_access_number) %>%
mutate(lineage = case_when(mitochondrial %in% "" ~ species, TRUE ~ mitochondrial))
### not very efficient function, need to be sure of each maxK per analysis
### clusType either Major or all
admixturePlot <- function(path = path, analysis = analysis, K = K, clusType = clusType,
labelSize = labelSize) {
samples <- read.table(list.files(paste0(path, "/", analysis), pattern = "_inds.txt",
recursive = F, full.names = T))$V1
lst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "ClumppIndFile.output",
recursive = T, full.names = T)
lst <- lst[str_detect(lst, "Cluster") & !str_detect(lst, "K=1")]
dtset <- dt %>%
filter(Ind %in% samples) %>%
arrange(Ind %in% samples) %>%
mutate(order = paste0("V", 1:nrow(.)), ID = paste0(substring(caste, 1, 1),
"_", Ind)) %>%
rename(pop = lineage) %>%
select(ID, pop, order)
dtCluster <- data.frame()
for (i in lst) {
outfile <- read.table(i, header = FALSE) %>%
select(-(V1:V5))
setLabel <- gsub("/", "_", gsub("/CLUMPP.files/ClumppIndFile.output", "",
unlist(str_split(i, pattern = "CLUMPAK/"))[2]))
colnames(outfile) <- paste0("K", 1:ncol(outfile))
outfile <- outfile %>%
mutate(order = paste0("V", 1:nrow(.)), set = setLabel) %>%
left_join(dtset, by = "order") %>%
pivot_longer(-c(order, ID, pop, set), names_to = "cluster") %>%
arrange(pop)
dtCluster <- bind_rows(dtCluster, outfile)
}
clusType <- ifelse(clusType %in% "Major", "Major", "Major|Minor")
maxK <- max(str_remove(dtCluster$cluster, "K"))
K <- ifelse(K != maxK, K, maxK)
dtCluster1 <- dtCluster %>%
separate(set, into = c("kSet", "clusterType"), sep = "_", remove = F) %>%
filter(str_detect(kSet, paste(seq(1, K), collapse = "|")), str_detect(clusterType,
clusType))
p1 = ggplot(data = dtCluster1, aes(x = ID, y = value, fill = cluster)) + geom_bar(stat = "identity",
show.legend = FALSE, colour = "black", width = 1, size = 0.1) + facet_grid(set ~
pop, scales = "free", space = "free") + theme_bw() + scale_fill_iwanthue() +
theme(plot.background = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.border = element_blank(), panel.spacing = unit(0.1,
"lines"), strip.background = element_rect(fill = "gray95"), strip.text.y = element_text(size = labelSize,
angle = 0), strip.text.x = element_text(size = labelSize), axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.y = element_text(size = labelSize),
axis.text.x = element_text(angle = 30, vjust = 1.2, hjust = 1, size = labelSize),
legend.position = "none", strip.placement = "outside") + scale_y_continuous(expand = c(0,
0))
return(p1)
}
cvPlot <- function(path = path, analysis = analysis) {
dt <- list.files(paste0(path, "/", analysis, "/stdout"), pattern = "stdout",
recursive = T, full.names = T)
cvs <- data.frame()
for (cv in dt) {
cvd <- read.delim(cv)
cvs <- bind_rows(cvs, data.frame(run = word(cv, -2, sep = fixed(".")), cvError = cvd[str_detect(cvd[,
1], "CV error"), ]))
}
cvs <- cvs %>%
mutate(cvError = word(cvError, sep = ": ", 2, 2)) %>%
separate(run, into = c("K", "rep"), sep = "_")
clst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "MCL.clusters|FilesToIndex",
recursive = T, full.names = T)
clst <- clst[!str_detect(clst, "Cluster")]
tpp <- data.frame()
for (cls in 1:(nrow(cvs)/20)) {
lss <- clst[str_detect(clst, paste0("K=", cls, "/"))]
mcl <- read.table(lss[str_detect(lss, "MCL")], sep = "\t")
clus <- str_split(mcl$V1, " ")
if (length(clus) > 1) {
names(clus) <- c("majorCluster", paste0("minorCluster", seq(1, (length(clus) -
1))))
} else {
names(clus) <- "majorCluster"
}
fti <- read.table(lss[str_detect(lss, "FilesToIndex")]) %>%
mutate(V2 = word(V2, -3, sep = fixed("."))) %>%
separate(col = V2, sep = "_", into = c("K", "rep"))
fti$cluster <- NA
for (j in names(clus)) {
fti[fti$V1 %in% clus[[j]], "cluster"] <- j
}
tpp <- bind_rows(tpp, fti)
}
cvs <- left_join(cvs, tpp, by = c("K", "rep"))
ggplot(cvs, aes(x = fct_inseq(K), y = as.numeric(cvError), color = cluster)) +
geom_boxplot(position = position_dodge(0.9)) + geom_point(position = position_dodge(0.9)) +
geom_text_repel(aes(label = rep), position = position_dodge(0.9)) + theme_bw() +
labs(x = "K", y = "CV value", title = "Variability in cross validation") +
theme(legend.position = "bottom")
}Admixture analyses were run with per analysis VCF files obtained from merging per sample VCF files. This was done to decrease time of bcftools mpileup per dataset, but to improve overlap of sites across RNAseq and WGS samples a .bed file was created with a set of SNPs including a little over 1 million sites.
The .bed file was it self created from the cleaning of a qsl2 vcf (subset of RNAseq queen samples representing the different mitochondrial lineages). The filtering of this vcf file retrieved biallelic sites (indels excluded) with Qual above 20, read depth above 10 and allowing 10% missing data across sites (script subMakeBed.sh).
Each sample VCF was called with keeping all sites, filtering (’FORMAT/DP>10 && QUAL>20) and then subsetting SNP sites from created .bed file. The final merged VCF file was then filtered for biallelic sites thinned every 500bp and allowing 25% missing data per site.
Admixture analyses were run aided by the admixturePipeline.py (https://github.com/stevemussmann/admixturePipeline) and evaluated with CLUMPAK.
Analyses were run for a dataset with all queens and seven subclades datasets the include queens and workers. These datasets focus on the three social hybridogenesis groups of species: Mbar only and Mbar with close relatives; Mebe only and Mebe with close relatives; and M. structor as well as M. structor lineages split between the two main clades: Ibericus + ponticus + mcarthuri (Mstr127) and muticus + structor3 + structor4 (Mstr346).
analysis <- "MessorQueens"
K = 6
clusType = "Major"
labelSize = 6
samples <- read.table(list.files(paste0(path, "/", analysis), pattern = "_inds.txt",
recursive = F, full.names = T))$V1
lst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "ClumppIndFile.output",
recursive = T, full.names = T)
lst <- lst[str_detect(lst, "Cluster") & !str_detect(lst, "K=1")]
dtset <- dt %>%
filter(Ind %in% samples) %>%
arrange(Ind %in% samples) %>%
mutate(order = paste0("V", 1:nrow(.)), ID = paste0(substring(caste, 1, 1), "_",
Ind)) %>%
rename(pop = species) %>%
select(ID, pop, order)
dtCluster <- data.frame()
for (i in lst) {
outfile <- read.table(i, header = FALSE) %>%
select(-(V1:V5))
setLabel <- gsub("/", "_", gsub("/CLUMPP.files/ClumppIndFile.output", "", unlist(str_split(i,
pattern = "CLUMPAK/"))[2]))
colnames(outfile) <- paste0("K", 1:ncol(outfile))
outfile <- outfile %>%
mutate(order = paste0("V", 1:nrow(.)), set = setLabel) %>%
left_join(dtset, by = "order") %>%
pivot_longer(-c(order, ID, pop, set), names_to = "cluster") %>%
arrange(pop)
dtCluster <- bind_rows(dtCluster, outfile)
}
clusType <- ifelse(clusType %in% "Major", "Major", "Major|Minor")
maxK <- max(str_remove(dtCluster$cluster, "K"))
K <- ifelse(K != maxK, K, maxK)
dtCluster1 <- dtCluster %>%
separate(set, into = c("kSet", "clusterType"), sep = "_", remove = F) %>%
filter(str_detect(kSet, paste(seq(1, K), collapse = "|")), str_detect(clusterType,
clusType))
p1 = ggplot(data = dtCluster1, aes(x = ID, y = value, fill = cluster)) + geom_bar(stat = "identity",
show.legend = FALSE, colour = "black", width = 1, size = 0.1) + facet_grid(set ~
pop, scales = "free", space = "free") + theme_bw() + scale_fill_iwanthue() +
theme(plot.background = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.border = element_blank(), panel.spacing = unit(0.1,
"lines"), strip.text.y = element_text(size = labelSize, angle = 0), strip.text.x = element_text(size = labelSize,
angle = 90), axis.text.x = element_text(angle = 30, vjust = 1.2, hjust = 1,
size = labelSize), axis.title.x = element_blank(), axis.ticks.x = element_blank(),
axis.title.y = element_blank(), axis.text.y = element_text(size = labelSize),
legend.position = "none", strip.placement = "outside")
p1locusID <- read.delim(paste0(path, "/messorQueens/MessorQueens_filteredThin.recode.map"),
header = F)Using 33151 SNPs.
cvPlot(path, analysis)analysis <- "MbarDecCap"
admixturePlot(path, analysis, K = 4, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 25913 SNPs.
cvPlot(path, analysis)analysis <- "Mbar"
admixturePlot(path, analysis, K = 4, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 22365 SNPs.
cvPlot(path, analysis)analysis <- "MbarQueens"
admixturePlot(path, analysis, K = 2, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 26072 SNPs.
cvPlot(path, analysis)The analysis with just queens confirms structuring within M. barbarus even though this is not very clear in the analysis including the workers. In the the analysis with workers sample SRR4292909 is not more closely related to the other queens from the same mtDNA clade but to two others workers that occur in different geographic locations.
analysis <- "MbarCRAM"
admixturePlot(path, analysis, K = 4, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_cramfilteredThin.recode.map"),
header = F)Using 107880 SNPs.
cvPlot(path, analysis)analysis <- "MebeWesMin"
admixturePlot(path, analysis, K = 4, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 16416 SNPs.
cvPlot(path, analysis)analysis <- "Mebe"
admixturePlot(path, analysis, K = 4, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 14540 SNPs.
cvPlot(path, analysis)analysis <- "MebeQueens"
admixturePlot(path, analysis, K = 3, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 12753 SNPs.
cvPlot(path, analysis)The analysis with just queens supports the lack of structure for M. ebeninus lineages, i.e. M. ebeninus samples represent a panmitic population.
analysis <- "Mstr"
admixturePlot(path, analysis, K = 8, "Major", 7)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 28361 SNPs.
cvPlot(path, analysis)analysis <- "Mstr127"
admixturePlot(path, analysis, K = 5, "all", 7)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 26095 SNPs.
cvPlot(path, analysis)analysis <- "Mstr346"
admixturePlot(path, analysis, K = 5, "all", 8)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 41311 SNPs.
cvPlot(path, analysis)analysis <- "MstrQueens"
admixturePlot(path, analysis, K = 7, "all", 7)locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
header = F)Using 31758 SNPs.
cvPlot(path, analysis)